{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b1c1ebaa-50de-4851-a720-acbb977551ea",
   "metadata": {},
   "source": [
    "# Recency Filtering\n",
    "\n",
    "Showcase the capabilities of the recency-weighted node postprocessors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92d06b38-2103-4a40-93c3-60e0708a1124",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jerryliu/Programming/llama_index/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
    "from llama_index.postprocessor import (\n",
    "    FixedRecencyPostprocessor,\n",
    "    EmbeddingRecencyPostprocessor,\n",
    ")\n",
    "from llama_index.text_splitter import SentenceSplitter\n",
    "from llama_index.storage.docstore import SimpleDocumentStore\n",
    "from llama_index.response.notebook_utils import display_response"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "67020156-2975-4bbb-8e98-afc55abb3d72",
   "metadata": {},
   "source": [
    "### Parse Documents into Nodes, add to Docstore\n",
    "\n",
    "In this example, there are 3 different versions of PG's essay. They are largely identical **except** \n",
    "for one specific section, which details the amount of funding they raised for Viaweb. \n",
    "\n",
    "Funding amount by version: V1 = 50k, V2 = 30k, V3 = 10k\n",
    "\n",
    "Document date by version: V1 = 2020-01-01, V2 = 2020-02-03, V3 = 2022-04-12\n",
    "\n",
    "The idea is to encourage the index to fetch the most recent info (which is V3)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caddd84e-9827-40a4-9520-dba6405fd1fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load documents\n",
    "from llama_index.storage.storage_context import StorageContext\n",
    "\n",
    "\n",
    "def get_file_metadata(file_name: str):\n",
    "    \"\"\"Return the date metadata for one versioned essay file.\n",
    "\n",
    "    The version tag (v1/v2/v3) is looked up in the final path component\n",
    "    only, so a parent directory whose name happens to contain one of the\n",
    "    tags cannot override the tag carried by the file name itself.\n",
    "\n",
    "    Args:\n",
    "        file_name: Path of the file being loaded.\n",
    "\n",
    "    Returns:\n",
    "        A dict with a single \"date\" key for the matching version.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the file name contains none of the known tags.\n",
    "    \"\"\"\n",
    "    import os\n",
    "\n",
    "    version_dates = {\n",
    "        \"v1\": \"2020-01-01\",\n",
    "        \"v2\": \"2020-02-03\",\n",
    "        \"v3\": \"2022-04-12\",\n",
    "    }\n",
    "    base_name = os.path.basename(file_name)\n",
    "    # tags are checked in v1 -> v2 -> v3 order; the first match wins\n",
    "    for tag, date in version_dates.items():\n",
    "        if tag in base_name:\n",
    "            return {\"date\": date}\n",
    "    raise ValueError(\"invalid file\")\n",
    "\n",
    "\n",
    "# read the three essay versions; get_file_metadata supplies each one's date\n",
    "essay_paths = [\n",
    "    \"test_versioned_data/paul_graham_essay_v1.txt\",\n",
    "    \"test_versioned_data/paul_graham_essay_v2.txt\",\n",
    "    \"test_versioned_data/paul_graham_essay_v3.txt\",\n",
    "]\n",
    "reader = SimpleDirectoryReader(\n",
    "    input_files=essay_paths,\n",
    "    file_metadata=get_file_metadata,\n",
    ")\n",
    "documents = reader.load_data()\n",
    "\n",
    "# one sentence splitter (chunk_size=512) is used both directly and via the\n",
    "# service context handed to the postprocessors\n",
    "text_splitter = SentenceSplitter(chunk_size=512)\n",
    "service_context = ServiceContext.from_defaults(text_splitter=text_splitter)\n",
    "\n",
    "# split the loaded documents into nodes\n",
    "nodes = text_splitter.get_nodes_from_documents(documents)\n",
    "\n",
    "# register the nodes in a docstore and wrap it in a storage context\n",
    "docstore = SimpleDocumentStore()\n",
    "docstore.add_documents(nodes)\n",
    "storage_context = StorageContext.from_defaults(docstore=docstore)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "191ced40-80f4-40e7-bf31-0c9a5a664cf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# show the text of the third loaded document\n",
    "third_doc_text = documents[2].get_text()\n",
    "print(third_doc_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5a25b95-de5e-4e56-a846-51e9c6eba181",
   "metadata": {},
   "source": [
    "### Build Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f7f68d6-2389-4f6c-bc4e-8612a1a53fb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 84471 tokens\n"
     ]
    }
   ],
   "source": [
    "# index the parsed nodes, reusing the storage context that wraps the docstore\n",
    "index = VectorStoreIndex(\n",
    "    nodes,\n",
    "    storage_context=storage_context,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86c5e8aa-18d8-4229-b7b2-a1c97c11a09a",
   "metadata": {},
   "source": [
    "### Define Recency Postprocessors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba5e10c9-5a7e-4ea8-a74d-0e0f74b5cd1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fixed recency postprocessor, configured with the shared service context\n",
    "node_postprocessor = FixedRecencyPostprocessor(\n",
    "    service_context=service_context,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94f44f2b-d816-43a0-87dc-ea8eefc7d534",
   "metadata": {},
   "outputs": [],
   "source": [
    "# embedding-based recency postprocessor, configured with the shared service context\n",
    "node_postprocessor_emb = EmbeddingRecencyPostprocessor(\n",
    "    service_context=service_context,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efcfffe4-a8aa-486d-b46d-f73f985dffca",
   "metadata": {},
   "source": [
    "### Query Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78d6c3db-61e6-4d9a-a84d-d7be846b4112",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1813 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens\n"
     ]
    }
   ],
   "source": [
    "# naive query: top-3 similarity retrieval with no recency postprocessing\n",
    "\n",
    "query_engine = index.as_query_engine(\n",
    "    similarity_top_k=3,\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"How much did the author raise in seed funding from Idelle's husband\"\n",
    "    \" (Julian) for Viaweb?\",\n",
    ")\n",
    "# render the answer so it can be compared against the recency-aware queries\n",
    "display_response(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d672c52-c0ac-4e5f-9175-855e66eb97ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# query using fixed recency node postprocessor\n",
    "\n",
    "query_engine = index.as_query_engine(\n",
    "    similarity_top_k=3, node_postprocessors=[node_postprocessor]\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"How much did the author raise in seed funding from Idelle's husband\"\n",
    "    \" (Julian) for Viaweb?\",\n",
    ")\n",
    "# render the answer produced after fixed-recency postprocessing\n",
    "display_response(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc1328c1-23b2-406c-b80b-6d97bffc33ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens\n"
     ]
    }
   ],
   "source": [
    "# query using embedding-based node postprocessor\n",
    "\n",
    "query_engine = index.as_query_engine(\n",
    "    similarity_top_k=3, node_postprocessors=[node_postprocessor_emb]\n",
    ")\n",
    "response = query_engine.query(\n",
    "    \"How much did the author raise in seed funding from Idelle's husband\"\n",
    "    \" (Julian) for Viaweb?\",\n",
    ")\n",
    "# render the answer produced after embedding-based recency postprocessing\n",
    "display_response(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd00cc97-4de7-4c61-9c0c-3f9ee3598528",
   "metadata": {},
   "source": [
    "### Query Index (Lower-Level Usage)\n",
    "\n",
    "In this example, we first get the full set of nodes from a query call, then send them to the node postprocessor, and\n",
    "finally synthesize a response through a summary index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "350b039e-d45d-4b6b-957a-4b14d8816cbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import SummaryIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "234f909f-6faa-43e6-96f8-0966699c9552",
   "metadata": {},
   "outputs": [],
   "source": [
    "# question shared by the retrieval step and the synthesis step\n",
    "query_str = \"\".join(\n",
    "    [\n",
    "        \"How much did the author raise in seed funding from Idelle's husband\",\n",
    "        \" (Julian) for Viaweb?\",\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20afbf6b-9473-446e-b522-b90fef2e3bf0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 22 tokens\n"
     ]
    }
   ],
   "source": [
    "# run the query in \"no_text\" response mode and keep the top 3 matches\n",
    "query_engine = index.as_query_engine(\n",
    "    response_mode=\"no_text\",\n",
    "    similarity_top_k=3,\n",
    ")\n",
    "init_response = query_engine.query(query_str)\n",
    "# unwrap each source-node entry to its underlying node\n",
    "resp_nodes = [scored.node for scored in init_response.source_nodes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdc03574-a806-4255-953c-6f82fc3f202f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 541 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n"
     ]
    }
   ],
   "source": [
    "# build a summary index over the retrieved nodes and query it with the\n",
    "# fixed recency postprocessor attached\n",
    "summary_index = SummaryIndex(resp_nodes)\n",
    "query_engine = summary_index.as_query_engine(\n",
    "    node_postprocessors=[node_postprocessor]\n",
    ")\n",
    "response = query_engine.query(query_str)\n",
    "# render the synthesized answer\n",
    "display_response(response)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama_index",
   "language": "python",
   "name": "llama_index"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
